library(gdata)
setwd("/Users/brucedesmarais/Desktop/Research/SenPE/Data/")
library(foreign)
sendat <- read.dta("senator_data.dta")

dat <- list()
files <- dir()[2:13]

for(i in 1:length(files)){
	dat[[i]] <- read.xls(files[i],stringsAsFactors=F, na.strings=c("NA",""))
}

## Functions to Process Names ##

sname <- function(chars){
	space <- which(chars==" ")
	if(length(space)>0) chars <- chars[-space]
	com <- which(chars==",")[1]
	if(is.na(com)){
		return(toupper(paste(chars,collapse="")))
		}
	if(!is.na(com)){
	chars[com] <- "_"
	return(toupper(paste(chars[1:(com+1)],collapse="")))
	}
	}

snames <- function(x){
	x <- strsplit(x,split="")
	unlist(lapply(x,sname))
	}
	
snamenc <- function(chars){
	space <- which(chars==" ")
	if(length(space)>0) chars <- chars[-space]
	com <- which(chars==",")[1]
	if(is.na(com)){
		return(toupper(paste(chars,collapse="")))
		}
	if(!is.na(com)){
	chars[com] <- "_"
	return(toupper(paste(chars[1:(com-1)],collapse="")))
	}
	}

snamesnc <- function(x){
	x <- strsplit(x,split="")
	unlist(lapply(x,snamenc))
	}
	
snamenu <- function(chars){
	space <- which(chars==" ")
	if(length(space)>0) chars <- chars[-space]
	com <- which(chars=="_")[1]
	if(is.na(com)){
		return(toupper(paste(chars,collapse="")))
		}
	if(!is.na(com)){
	chars[com] <- "_"
	return(toupper(paste(chars[1:(com-1)],collapse="")))
	}
	}

snamesnu <- function(x){
	x <- strsplit(x,split="")
	unlist(lapply(x,snamenu))
	}	

sn_i <- list()
senators <- NULL
sn_inc <- list()
years <- NULL
for(i in 1:length(dat)){
	sens <- snames(na.omit(unique(c(as.matrix(dat[[i]][,paste("s",1:15,sep="")])))))
	senators <- sort(unique(c(senators,sens)))
	years <- rbind(years,cbind(na.omit(unique(dat[[i]]$year)),i))
	sn_i[[i]] <- sort(sens)
}
years <- years[order(years[,1]),]

congs <- c(108:110,96:105)

years <- cbind(years,congs[years[,2]])

q <- list(c(1,2,3),c(4,5,6),c(7,8,9),c(10,11,12))

qnets <- list()
empty <- matrix(0,length(senators),length(senators))
for(i in 1:nrow(years)){
	k <- years[,2][i]
	dati <- dat[[k]]
	amats <- list()
	for(t in 1:4){
		amat <- empty
		datit <- subset(dati,is.element(dati$month,q[[t]]))
		sens <- as.matrix(datit[,paste("s",1:15,sep="")])
		srows <- which(apply(is.na(sens),1,sum) < 15)
		for(j in srows){
			svec <- snames(na.omit(sens[j,]))
			if(length(svec)>1){
			ind <- match(svec,senators)
			amat[ind,ind] <- amat[ind,ind] +1/(length(svec)-1)
			}
		}
		diag(amat) <- 0
		amats[[t]] <- amat
	}
	qnets[[i]] <- amats
}

a <- list(c(1,2,3,4,5,6,7,8,9,10,11,12))

anets <- list()
empty <- matrix(0,length(senators),length(senators))
for(i in 1:nrow(years)){
	k <- years[,2][i]
	dati <- dat[[k]]
	amats <- list()
	for(t in 1){
		amat <- empty
		datit <- subset(dati,is.element(dati$month,a[[t]]))
		sens <- as.matrix(datit[,paste("s",1:15,sep="")])
		srows <- which(apply(is.na(sens),1,sum) < 15)
		for(j in srows){
			svec <- snames(na.omit(sens[j,]))
			if(length(svec)>1){
			ind <- match(svec,senators)
			amat[ind,ind] <- amat[ind,ind] +1/(length(svec)-1)
			}
		}
		diag(amat) <- 0
		amats[[t]] <- amat
	}
	anets[[i]] <- amats
}

a <- list(c(1,2,3,4,5,6,7,8,9,10,11,12))

cnets <- list()
empty <- matrix(0,length(senators),length(senators))
for(i in 1:13){
	k <- i
	dati <- dat[[k]]
	amats <- list()
	for(t in 1){
		amat <- empty
		datit <- subset(dati,is.element(dati$month,a[[t]]))
		sens <- as.matrix(datit[,paste("s",1:15,sep="")])
		srows <- which(apply(is.na(sens),1,sum) < 15)
		for(j in srows){
			svec <- snames(na.omit(sens[j,]))
			if(length(svec)>1){
			ind <- match(svec,senators)
			amat[ind,ind] <- amat[ind,ind] +1/(length(svec)-1)
			}
		}
		diag(amat) <- 0
		amats[[t]] <- amat
	}
	cnets[[i]] <- amat
}


#Fowler's Data#

fowler <- read.csv("SH.csv",stringsAsFactors=F)
fowler <- subset(fowler,fowler[,2]=="S")

## Plot all Connected Nodes ##

for(i in 1:nrow(years)){
	for(j in 1:4){
		netij <- nets[[i]][[j]]
		if(sum(netij)>0){
		set.seed(5)
		fileij <- paste("./Plots/everything/y",years[i,1],"q",j,".pdf",sep="")
		pdf(fileij,height=4, width=4, family="Times", pointsize=7)
		gplot(netij,displayisolates=F,gmode="graph",label=senators,label.cex=.5,vertex.cex=.75,edge.lwd=.8,edge.col="lightblue", main=paste(years[i,1]," q",j,sep=""), vertex.border="red")
		dev.off()
		}
	}
}	


## Major Component ##

for(i in 1:nrow(years)){
	for(j in 1:4){
		netij <- nets[[i]][[j]]
		nr <- degree(reachability(netij))
		nr0 <- which(nr < 25)
		netij[nr0,] <- 0
		netij[,nr0] <- 0
		degn <- degree(netij)
		if(sum(netij)>0){
		set.seed(5)
		fileij <- paste("./Plots/major/y",years[i,1],"q",j,".pdf",sep="")
		pdf(fileij,height=4, width=4, family="Times", pointsize=7)
		gplot(netij,displayisolates=F,gmode="graph",label=senators,label.cex=.5,vertex.cex=.75,edge.lwd=.8,edge.col="lightblue", main=paste(years[i,1]," q",j,sep=""), vertex.border="red")
		dev.off()
		}
	}
}	

###### Comparison with Fowler ########

## Add centrality to Fowler's Data ##

nc <- rep(NA,nrow(fowler))
ec <- rep(NA,nrow(fowler))
bc <- rep(NA,nrow(fowler))
dc <- rep(NA,nrow(fowler))

fowler <- read.csv("SH.csv",stringsAsFactors=F)
fowler <- subset(fowler,fowler[,2]=="S")
fowid <- paste(snames(fowler$labels),fowler$congress,sep="")
fowidnc <- paste(snamesnc(fowler$labels),fowler$congress,sep="")
for(i in 1:13){
library(igraph)
net <- cnets[[i]]
active <- match(sn_i[[i]],senators)
net <- net[active,active]
distnet <- graph.adjacency(1/net,weighted=T)
sp <- shortest.paths(distnet,algorithm="dijkstra")
csp <- c(sp)
csp_inf <- which(csp==Inf)
csp[csp_inf] <- 1.25*max(csp[-csp_inf])
msp <- matrix(csp,nrow(sp),nrow(sp)) 
new_cent <- 1/apply(msp,2,sum)
detach(package:igraph)
library(sna)
ev_cent <- evcent(net)
bet_cent <- betweenness(net)
cl_cent <- apply(net,2,sum)
peid <- paste(sn_i[[i]],congs[i],sep="")
peidnc <- paste(snamesnu(sn_i[[i]]),congs[i],sep="")
pe_fow <- match(peid,fowid)
pe_fownc <- match(peidnc,fowidnc)
pe_fow[which(is.na(pe_fow))] <- pe_fownc[which(is.na(pe_fow))]
naid <- which(!is.na(pe_fow))
pe_fow <- pe_fow[naid]
nc[pe_fow] <- new_cent[naid]
ec[pe_fow]<- ev_cent[naid]
bc[pe_fow] <- bet_cent[naid]
dc[pe_fow] <- cl_cent[naid]
}

fowler <- data.frame(fowler,nc,ec,bc,dc)

fow_prev <- matrix(NA,nrow(fowler),ncol(fowler))
fowpid <- paste(snames(fowler$labels),fowler$congress+1,sep="")
fowlinfow <- match(fowpid,fowid)
fowpidnc <- paste(snamesnc(fowler$labels),fowler$congress+1,sep="")
fowlinfow[which(is.na(fowlinfow))] <- match(fowpidnc,fowidnc)[which(is.na(fowlinfow))]
naid <- which(!is.na(fowlinfow))
for(i in 1:length(naid)){
fow_prev[fowlinfow[naid[i]],c(1,4:27)] <- as.numeric(fowler[naid[i],c(1,4:27)])
}
fow_prev <- data.frame(fow_prev)
names(fow_prev) <- paste("l_",names(fowler),sep="")

fowall <- data.frame(fowler,fow_prev[,4:27])

nomis <- which(complete.cases(fowall[,c("pa","l_pa","l_betweeness","l_evcent","l_closeness","l_connectedness","l_nc","l_ec","l_bc","l_dc")]))

cfowall <- fowall[nomis,]
cfowall <- subset(cfowall,cfowall$congress>97)

# Fowler's data covers 93-108
# Ours, 96-110 excluding 106 and 107 
# Data cover 96-108 less 2 - 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 108
# In forecasting analysis we have 97, 98, 99, 100, 101, 102, 103, 104, 105, 106 (all repeat)

lag <- glm.nb(pa~l_pa, data=cfowall)
allcos <- glm.nb(pa~l_pa+l_betweeness+l_evcent+l_closeness+l_connectedness, data=cfowall)
allspe <- glm.nb(pa~l_pa+l_nc+l_ec+l_bc+l_cc, data=cfowall)
full <- glm.nb(pa~l_pa+l_betweeness+l_evcent+l_closeness+l_connectedness+l_nc+l_ec+l_bc+l_dc, data=cfowall,x=T)

bestmod <- stepAIC(full)
bestmodb <- stepAIC(full,k=log(nrow(cfowall)))


## Bootstrap Full ##

inbest <- matrix(0,100,length(coef(full)))
inbestb <- matrix(0,100,length(coef(full)))

for(i in 1:100){
cfowalli <- cfowall[sample(1:nrow(cfowall),nrow(cfowall),rep=T),]

fulli <- glm.nb(pa~l_pa+l_betweeness+l_evcent+l_closeness+l_connectedness+l_nc+l_ec+l_bc+l_dc, data=cfowalli)

bestmodi <- stepAIC(fulli)
bestmodbi <- stepAIC(fulli,k=log(nrow(cfowall)))
inbest[i,match(names(coef(bestmodi)),names(coef(full)))] <- 1
inbestb[i,match(names(coef(bestmodbi)),names(coef(full)))] <- 1
print(i)
}

## Log Score Full ##

ucong <- unique(cfowall$congress)
lscores <- matrix(NA,8,8)
ind <- 1
contrib <- NULL
for(out in 1:8){
	preds <- as.logical(rep(1,10))
	if(out > 0){
		preds[out+2] <- F
		}
	for(t in ucong[3:length(ucong)]){
		Xt <- full$x[which(cfowall$congress<t & cfowall$congress>97),preds]
		yt <- full$y[which(cfowall$congress<t & cfowall$congress>97)]
		Xv <- full$x[which(cfowall$congress==t),preds]
		yv <- full$y[which(cfowall$congress==t)]
		train <- glm.nb(yt~Xt-1)
		lscores[t-ucong[2],out] <- sum(log(dnbinom(yv,mu=exp(Xv%*%cbind(coef(train))),size=train$theta)))
		}
	}
	
## Full Model ##
preds <- as.logical(rep(1,10))
full_fit <- NULL
	for(t in ucong[3:length(ucong)]){
		Xt <- full$x[which(cfowall$congress<t & cfowall$congress>97),preds]
		yt <- full$y[which(cfowall$congress<t & cfowall$congress>97)]
		Xv <- full$x[which(cfowall$congress==t),preds]
		yv <- full$y[which(cfowall$congress==t)]
		train <- glm.nb(yt~Xt-1)
		full_fit <- c(full_fit,sum(log(dnbinom(yv,mu=exp(Xv%*%cbind(coef(train))),size=train$theta))))
		}

# One year prediction

ucong <- unique(cfowall$congress)
lscores1 <- matrix(NA,8,8)
ind <- 1
contrib <- NULL
for(out in 1:8){
	preds <- as.logical(rep(1,10))
	if(out > 0){
		preds[out+2] <- F
		}
	for(t in ucong[2:length(ucong)]){
		Xt <- full$x[which(cfowall$congress==t-1),preds]
		yt <- full$y[which(cfowall$congress==t-1)]
		Xv <- full$x[which(cfowall$congress==t),preds]
		yv <- full$y[which(cfowall$congress==t)]
		train <- glm.nb(yt~Xt-1)
		lscores1[t-ucong[2],out] <- sum(log(dnbinom(yv,mu=exp(Xv%*%cbind(coef(train))),size=train$theta)))
		}
	}

## Full Model ##
preds <- as.logical(rep(1,10))
full_fit1 <- NULL
	for(t in ucong[3:length(ucong)]){
		Xt <- full$x[which(cfowall$congress==t-1),preds]
		yt <- full$y[which(cfowall$congress==t-1)]
		Xv <- full$x[which(cfowall$congress==t),preds]
		yv <- full$y[which(cfowall$congress==t)]
		train <- glm.nb(yt~Xt-1)
		full_fit1 <- c(full_fit1,sum(log(dnbinom(yv,mu=exp(Xv%*%cbind(coef(train))),size=train$theta))))
		}

### Plotting lscores less full fit ###
# l_betweeness+l_evcent+l_closeness+l_connectedness+l_nc+l_ec+l_bc+l_dc

effnames <- c("BC_C","EC_C","DC_C","NC_C","NC_P","EC_P","BC_P","DC_P")
fit_dif <- NULL
for(i in 1:8){
	fit_dif <- cbind(fit_dif,lscores[,i]-full_fit)
	}
lty <- c(2,2,2,2,1,1,1,1)
col <- c("black","blue","red","green","green","blue","black","red")
pdf("./Plots/logscore.pdf",height=4, width=6, family="Times", pointsize=10)
par(las=1,mar=c(4,4,1,1))
plot(99:106,fit_dif[,1],col=col[1],ylim=c(min(c(fit_dif)),max(c(fit_dif))))
for(i in 1:8){
	points(99:106,fit_dif[,i],col=col[i])
	lines(99:106,fit_dif[,i],col=col[i],lty=lty[i],lwd=2)
	}
	abline(h=0,col="grey55")
	legend("top",legend=effnames,pch=21,lty=lty,col=col)
dev.off()

central <- full$x[,3:10]
colnames(central) <- effnames
cor(central)

 plot(central[,c(3,8)])
 text(x=central[,c(3)],y=central[,8],labels=paste(cfowall$labels,cfowall$congress), cex=.5)

write.dta(cfowall,"cosp_prev.dta")